# Required Packages
import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima.arima import auto_arima
from datetime import datetime, timedelta
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we use a dataset from Fred Economic Data. In particular, we use Industrial Production (electric and gas utilities) from 1939-01-01 until Now (the last available dataset). This dataset can be obtained here.
The industrial production (IP) index measures the real output of all relevant establishments located in the United States, regardless of their ownership, but not those located in U.S. territories.
# Industrial Production: Electric and gas utilities
Data = pd.read_csv('Data/IPG2211A2N.csv').reset_index(drop = False)
Data = Data.rename(columns={'DATE': 'Date', 'IPG2211A2N': 'Index'})
Data = Data.set_index('Date').drop(columns = ['index'])
# Converting the index column values to datetime
Data.index = pd.to_datetime(Data.index)
display(Data.tail(12).T)
print ('The Number of NaN values = %i' % Data[pd.isnull(Data.Index)].sum()[0])
| Date | 2020-04-01 | 2020-05-01 | 2020-06-01 | 2020-07-01 | 2020-08-01 | 2020-09-01 | 2020-10-01 | 2020-11-01 | 2020-12-01 | 2021-01-01 | 2021-02-01 | 2021-03-01 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Index | 86.2705 | 86.7248 | 98.7536 | 114.7288 | 111.0092 | 97.1503 | 91.6393 | 94.1296 | 115.7566 | 120.9284 | 120.4768 | 98.6848 |
The Number of NaN values = 0
def TimeSeriesPlot(Inp, Feat, Title, yLim = None, Color = 'RoyalBlue'):
fig = px.line(Inp.reset_index(drop = False), x='Date', y=Feat, color_discrete_sequence = [Color])
fig.update_xaxes(rangeslider_visible=True, rangeslider =dict(bgcolor = 'WhiteSmoke'),
rangeselector=dict(bgcolor='WhiteSmoke',
buttons=list([dict(count=1, label='One Month', step='month', stepmode='todate'),
dict(count=6, label='Six Months', step='month', stepmode='todate'),
dict(count=1, label='This Year', step='year', stepmode='todate'),
dict(step='all')])))
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray')
if not yLim == None:
fig.update_yaxes(range=yLim)
fig.update_layout(title={'text': '<b>' + Title + '<b>', 'x':.5, 'y': .98, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
TimeSeriesPlot(Data, 'Index', 'Industrial Production: Electric and gas utilities', [0, 140])
First, we can use seasonal decompose function (read more about this function here to analyze the Data.
Results = seasonal_decompose(Data, model='multiplicative')
#
Table = pd.concat([Results.observed, Results.trend, Results.seasonal, Results.resid], axis=1)
Table.columns = ['Observed Data', 'The Estimated Trend Component',
'The Estimated Seasonal Component','The Estimated Residuals']
display(Table.dropna().tail(12).style.set_precision(4)\
.bar(subset=['Observed Data'], align='mid', color=['LimeGreen'])\
.bar(subset=['The Estimated Residuals'], align='mid', color=['Tomato']))
Colors = ['ForestGreen', 'OrangeRed', 'SteelBlue', 'Purple']
Lims = [[0, 140], [0, 120], [0.9, 1.15], [0.85, 1.15]]
for c in range(len(Table.columns)):
TimeSeriesPlot(Table, Table.columns[c], Table.columns[c], Lims[c], Colors[c])
del Colors, Lims, c
| Observed Data | The Estimated Trend Component | The Estimated Seasonal Component | The Estimated Residuals | |
|---|---|---|---|---|
| Date | ||||
| 2019-10-01 00:00:00 | 93.9518 | 103.0897 | 0.9370 | 0.9726 |
| 2019-11-01 00:00:00 | 102.1902 | 102.7708 | 0.9567 | 1.0393 |
| 2019-12-01 00:00:00 | 113.0698 | 102.5640 | 1.0459 | 1.0540 |
| 2020-01-01 00:00:00 | 116.0371 | 102.6452 | 1.1092 | 1.0191 |
| 2020-02-01 00:00:00 | 109.6578 | 102.7031 | 1.0541 | 1.0130 |
| 2020-03-01 00:00:00 | 97.8951 | 102.4894 | 1.0058 | 0.9497 |
| 2020-04-01 00:00:00 | 86.2705 | 102.1902 | 0.9366 | 0.9013 |
| 2020-05-01 00:00:00 | 86.7248 | 101.7580 | 0.9231 | 0.9233 |
| 2020-06-01 00:00:00 | 98.7536 | 101.5341 | 0.9752 | 0.9973 |
| 2020-07-01 00:00:00 | 114.7288 | 101.8499 | 1.0299 | 1.0937 |
| 2020-08-01 00:00:00 | 111.0092 | 102.5045 | 1.0408 | 1.0405 |
| 2020-09-01 00:00:00 | 97.1503 | 102.9882 | 0.9856 | 0.9571 |
Furthermore, here we use the auto_arima function that Automatically discovers the optimal order for an ARIMA model for the sake of predictions. Ideally, we are looking for a model that fits the data and has the lost AIC score, AIC stands for the Akaike information criterion. the AIC score gives provides a way to measure the goodness-of-fit of your model.
Model = auto_arima(Data, start_p=1, start_q=1, max_p=3, max_q=3, m=12,
start_P=0, seasonal=True, d=1, D=1, trace=True,
error_action='ignore', suppress_warnings=True, stepwise=True)
print(Back.GREEN + Fore.WHITE + 'The AIC score' + Style.RESET_ALL + ' = %.4f' % Model.aic())
Performing stepwise search to minimize aic
ARIMA(1,1,1)(0,1,1)[12] : AIC=4008.188, Time=0.97 sec
ARIMA(0,1,0)(0,1,0)[12] : AIC=4495.855, Time=0.05 sec
ARIMA(1,1,0)(1,1,0)[12] : AIC=4336.152, Time=0.24 sec
ARIMA(0,1,1)(0,1,1)[12] : AIC=4117.810, Time=0.50 sec
ARIMA(1,1,1)(0,1,0)[12] : AIC=4269.664, Time=0.31 sec
ARIMA(1,1,1)(1,1,1)[12] : AIC=4000.699, Time=1.61 sec
ARIMA(1,1,1)(1,1,0)[12] : AIC=4170.932, Time=0.79 sec
ARIMA(1,1,1)(2,1,1)[12] : AIC=3973.189, Time=1.80 sec
ARIMA(1,1,1)(2,1,0)[12] : AIC=4061.939, Time=1.34 sec
ARIMA(1,1,1)(2,1,2)[12] : AIC=3959.014, Time=4.61 sec
ARIMA(1,1,1)(1,1,2)[12] : AIC=3990.403, Time=2.95 sec
ARIMA(0,1,1)(2,1,2)[12] : AIC=4048.895, Time=3.54 sec
ARIMA(1,1,0)(2,1,2)[12] : AIC=4101.893, Time=4.17 sec
ARIMA(2,1,1)(2,1,2)[12] : AIC=3960.979, Time=5.01 sec
ARIMA(1,1,2)(2,1,2)[12] : AIC=3960.976, Time=5.14 sec
ARIMA(0,1,0)(2,1,2)[12] : AIC=4167.783, Time=2.22 sec
ARIMA(0,1,2)(2,1,2)[12] : AIC=3986.038, Time=3.26 sec
ARIMA(2,1,0)(2,1,2)[12] : AIC=4063.098, Time=3.98 sec
ARIMA(2,1,2)(2,1,2)[12] : AIC=3958.349, Time=8.75 sec
ARIMA(2,1,2)(1,1,2)[12] : AIC=3993.739, Time=7.26 sec
ARIMA(2,1,2)(2,1,1)[12] : AIC=3977.188, Time=2.40 sec
ARIMA(2,1,2)(1,1,1)[12] : AIC=4003.923, Time=3.53 sec
ARIMA(3,1,2)(2,1,2)[12] : AIC=3959.853, Time=10.66 sec
ARIMA(2,1,3)(2,1,2)[12] : AIC=3959.731, Time=11.45 sec
ARIMA(1,1,3)(2,1,2)[12] : AIC=3962.804, Time=8.84 sec
ARIMA(3,1,1)(2,1,2)[12] : AIC=3962.783, Time=5.87 sec
ARIMA(3,1,3)(2,1,2)[12] : AIC=3954.213, Time=13.77 sec
ARIMA(3,1,3)(1,1,2)[12] : AIC=3985.039, Time=13.69 sec
ARIMA(3,1,3)(2,1,1)[12] : AIC=3964.923, Time=9.96 sec
ARIMA(3,1,3)(1,1,1)[12] : AIC=3999.501, Time=8.07 sec
ARIMA(3,1,3)(2,1,2)[12] intercept : AIC=3960.665, Time=25.01 sec
Best model: ARIMA(3,1,3)(2,1,2)[12]
Total fit time: 171.756 seconds
The AIC score = 3954.2126
We can consider the data from the current year as the Test set and the rest of the data as the train set. That is
def Header(Text, L = 100, C = 'Blue', T = 'White'):
BACK = {'Black': Back.BLACK, 'Red':Back.RED, 'Green':Back.GREEN, 'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
'Magenta':Back.MAGENTA, 'Cyan': Back.CYAN}
FORE = {'Black': Fore.BLACK, 'Red':Fore.RED, 'Green':Fore.GREEN, 'Yellow':Fore.YELLOW, 'Blue':Fore.BLUE,
'Magenta':Fore.MAGENTA, 'Cyan':Fore.CYAN, 'White': Fore.WHITE}
print(BACK[C] + FORE[T] + Style.NORMAL + Text + Style.RESET_ALL + ' ' + FORE[C] +
Style.NORMAL + (L- len(Text) - 1)*'=' + Style.RESET_ALL)
def Line(L=100, C = 'Blue'):
FORE = {'Black': Fore.BLACK, 'Red':Fore.RED, 'Green':Fore.GREEN, 'Yellow':Fore.YELLOW, 'Blue':Fore.BLUE,
'Magenta':Fore.MAGENTA, 'Cyan':Fore.CYAN, 'White': Fore.WHITE}
print(FORE[C] + Style.NORMAL + L*'=' + Style.RESET_ALL)
# Train and Test Sets
Split = Data.index[-1] - timedelta(weeks= 12*4)
C = Split.strftime("%Y-%m-%d")
Train = Data.loc[:C]
Test = Data.loc[C:]
del C
Header(Text = 'Train')
display(Train.T)
Header(Text = 'Test', C ='Green')
display(Test.T)
Train ==============================================================================================
| Date | 1939-01-01 | 1939-02-01 | 1939-03-01 | 1939-04-01 | 1939-05-01 | 1939-06-01 | 1939-07-01 | 1939-08-01 | 1939-09-01 | 1939-10-01 | ... | 2019-06-01 | 2019-07-01 | 2019-08-01 | 2019-09-01 | 2019-10-01 | 2019-11-01 | 2019-12-01 | 2020-01-01 | 2020-02-01 | 2020-03-01 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Index | 3.3842 | 3.41 | 3.4875 | 3.5133 | 3.5133 | 3.565 | 3.565 | 3.6167 | 3.72 | 3.72 | ... | 98.4548 | 113.0791 | 111.2697 | 102.0185 | 93.9518 | 102.1902 | 113.0698 | 116.0371 | 109.6578 | 97.8951 |
1 rows × 975 columns
Test ===============================================================================================
| Date | 2020-04-01 | 2020-05-01 | 2020-06-01 | 2020-07-01 | 2020-08-01 | 2020-09-01 | 2020-10-01 | 2020-11-01 | 2020-12-01 | 2021-01-01 | 2021-02-01 | 2021-03-01 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Index | 86.2705 | 86.7248 | 98.7536 | 114.7288 | 111.0092 | 97.1503 | 91.6393 | 94.1296 | 115.7566 | 120.9284 | 120.4768 | 98.6848 |
Now, let's train the model
_ = Model.fit(Train)
Temp = Model.predict(n_periods = len(Test))
Predications = pd.DataFrame(Temp, index = Test.index, columns=['Predicated'])
del Temp
Predications.T
| Date | 2020-04-01 | 2020-05-01 | 2020-06-01 | 2020-07-01 | 2020-08-01 | 2020-09-01 | 2020-10-01 | 2020-11-01 | 2020-12-01 | 2021-01-01 | 2021-02-01 | 2021-03-01 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predicated | 85.465636 | 89.303402 | 100.120669 | 111.752917 | 110.33197 | 101.384327 | 92.609666 | 97.778402 | 111.477113 | 119.428564 | 107.721391 | 100.407832 |
Temp = Test.copy()
Temp.columns = ['Observed']
Temp = pd.concat([Temp, Predications], axis = 1).reset_index(drop = False)
Temp = pd.melt(Temp, id_vars=['Date'], value_vars=['Observed','Predicated'], var_name='Variable', value_name='Index')
fig = px.line(Temp, x='Date', y='Index', color = 'Variable', color_discrete_sequence = ['OrangeRed', 'SteelBlue'])
fig.update_xaxes(rangeslider_visible=True, rangeslider =dict(bgcolor = 'WhiteSmoke'),
rangeselector=dict(bgcolor='WhiteSmoke', buttons=list([
dict(count=1, label='One Month', step='month', stepmode='todate'),
dict(count=6, label='Six Months', step='month', stepmode='todate'),dict(step='all')])))
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[0, 140],
showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': '<b>' + 'Industrial Production: Electric and gas utilities' + '<b>', 'x':.5,
'y': .98, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()